################################
## Script: 04_gpt_annotator_bioweapons_lists.R
## Purpose: This code enumerates the claims and subjects of each
## bioweapons doc using gpt4o. 
## Data In:
## 1) bioweapons articles
## data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json
## Data Out:
## all files with the form
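## data/gpt4o_annotations/annotate_lists_[num].rds
## (one file per array task; [num] is the SLURM array task id)
## each file holds the raw GPT-4o responses; the annotation text is in the
## message content of each response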
## Notes:
## 1) Must be run as a SLURM array job with array task ids 1-100
## 2) this can't really be replicated 
## without incurring OpenAI API fees
## We have saved the code as a reference in case
## replicators would like to re-run the annotation themselves,
## but otherwise replicators can proceed with the saved output.

library(openai)
library(jsonlite)


## Load the bioweapons articles (one JSON record per line) into a data frame
articles_path <- "data/bioweapons_casestudy_5_20_2024_sbert_embeddings.json"
total_art <- stream_in(file(articles_path))

## SLURM array task id, read from the environment as a string; it selects this
## job's chunk of articles and is used in the output file name
slurm_arrayid <- Sys.getenv("SLURM_ARRAY_TASK_ID")
print(slurm_arrayid)


## Set OpenAI Key
## credentials 
#Sys.setenv(
#  OPENAI_API_KEY = ''
#)


##############################
## Prompts ###################
##############################

## Instruction placed in front of each paragraph when asking for its subjects.
## NOTE(review): "Ennumerate" is misspelled, but this string is sent to the
## model verbatim, so correcting it would change the prompt -- confirm whether
## the saved annotations were produced with this spelling before editing.
prompt_summarize_subject <- "Ennumerate the people, places, objects, and events detailed in the following paragraph."
## Instruction placed in front of each paragraph when asking for its claims.
prompt_summarize_claim <- "Enumerate the causal, normative, descriptive, and conceptual claims detailed in the following paragraph."

###############################
### Split Articles ############
###############################

## Validate the array task id up front so that a missing or out-of-range value
## stops with a clear message instead of an obscure error further down.
n_chunks <- 100L
chunk_id <- suppressWarnings(as.integer(slurm_arrayid))
if (is.na(chunk_id) || chunk_id < 1L || chunk_id > n_chunks) {
  stop("SLURM_ARRAY_TASK_ID must be an integer between 1 and ", n_chunks,
       "; got '", slurm_arrayid, "'", call. = FALSE)
}

## Deal the rows out round-robin: row i goes to chunk ((i - 1) %% 100) + 1.
## This is the same row-to-chunk assignment that split(total_art, 1:100)
## produces by recycling 1:100, but the grouping is built explicitly, so R does
## not warn when the row count is not a multiple of 100. Fixing the factor
## levels keeps all 100 chunks present even if some of them are empty.
chunk_of_row <- factor(rep_len(seq_len(n_chunks), nrow(total_art)),
                       levels = seq_len(n_chunks))
total_art <- split(total_art, chunk_of_row)[[chunk_id]]

##################################
#### Annotate Claims and Subjects#
##################################

## Fail before spending any API calls if this job has no usable chunk
## (for example, because the array task id was missing or out of range).
if (!is.data.frame(total_art) || is.null(total_art$summary)) {
  stop("total_art must be a data frame with a 'summary' column", call. = FALSE)
}

## Send one chat-completion request per paragraph and return the raw responses
## as a list, in the same order as `paragraphs`.
##   instruction: text placed in front of every paragraph
##   paragraphs:  vector of paragraph texts to annotate
##   model:       OpenAI model identifier passed to create_chat_completion()
## An empty `paragraphs` yields an empty list and no API calls.
annotate_paragraphs <- function(instruction, paragraphs,
                                model = "gpt-4o-2024-08-06") {
  lapply(seq_along(paragraphs), function(j) {
    prompt <- paste0(instruction, " \n**Paragraph:** ", paragraphs[j],
                     " \n**Your List**: ")
    ## `messages` is spelled out in full rather than relying on partial
    ## argument matching
    create_chat_completion(
      model = model,
      messages = list(list(role = "user", content = prompt))
    )
  })
}

## All subject requests are sent first, then all claim requests
annotation_subject <- annotate_paragraphs(prompt_summarize_subject,
                                          total_art$summary)

annotation_claim <- annotate_paragraphs(prompt_summarize_claim,
                                        total_art$summary)

## Bundle the responses: element 1 holds the subject annotations,
## element 2 the claim annotations
annotations <- list(annotation_subject, annotation_claim)

## Make sure the output directory exists so the API responses collected above
## are not lost to a failed write. An already-existing directory is left as is.
output_dir <- "data/gpt4o_annotations"
dir.create(output_dir, recursive = TRUE, showWarnings = FALSE)

## One output file per array task, named after the task id
filename <- paste0(output_dir, "/annotate_lists_", slurm_arrayid, ".rds")

saveRDS(annotations, filename)








